{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Creating a document similarity microservice for the Reuters-21578 dataset.\n",
    "\n",
    "First download the Reuters-21578 dataset in JSON format into the local folder:\n",
    "\n",
    "```bash\n",
    "git clone https://github.com/fergiemcdowall/reuters-21578-json\n",
    "```\n",
    "\n",
    "The first step will be to convert this into the default corpus format we use:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loaded  10377  documents\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import codecs \n",
    "import os\n",
    "\n",
    "# Directory holding the JSON dump cloned from fergiemcdowall/reuters-21578-json.\n",
    "DATA_DIR = \"reuters-21578-json/data/full\"\n",
    "\n",
    "# Convert every article that has both topics and a body into the\n",
    "# id/text/title/tags dictionary format passed to DefaultJsonCorpus below.\n",
    "docs = []\n",
    "for filename in sorted(os.listdir(DATA_DIR)):  # sorted: reproducible document order\n",
    "    # A context manager closes each file handle as soon as it is parsed.\n",
    "    with open(os.path.join(DATA_DIR, filename)) as f:\n",
    "        js = json.load(f)\n",
    "    for j in js:\n",
    "        if 'topics' in j and 'body' in j:\n",
    "            docs.append({\n",
    "                \"id\": j['id'],\n",
    "                # Replace newlines with a space, not an empty string, so the last\n",
    "                # word of one line is not fused with the first word of the next.\n",
    "                \"text\": j['body'].replace(\"\\n\", \" \"),\n",
    "                \"title\": j['title'],\n",
    "                \"tags\": \",\".join(j['topics']),\n",
    "            })\n",
    "print \"loaded \",len(docs),\" documents\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a gensim LSI document similarity model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/clive/tools/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
      "INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])\n",
      "INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary(71167 unique tokens: [u'yetunspecified', u'europeancommission', u'overheadcosts', u'mdbl', u'othersecurities']...)\n",
      "INFO:gensim.corpora.dictionary:built Dictionary(73530 unique tokens: [u'yetunspecified', u'europeancommission', u'overheadcosts', u'mdbl', u'othersecurities']...) from 10377 documents (total 1255015 corpus positions)\n",
      "INFO:gensim.models.tfidfmodel:collecting document frequencies\n",
      "INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0\n",
      "INFO:gensim.models.tfidfmodel:PROGRESS: processing document #10000\n",
      "INFO:gensim.models.tfidfmodel:calculating IDF weights for 10377 documents and 73529 features (752872 matrix non-zeros)\n",
      "INFO:seldon.text.docsim:Building gensim lsi model\n",
      "INFO:gensim.models.lsimodel:using serial LSI version on this node\n",
      "INFO:gensim.models.lsimodel:updating model with new documents\n",
      "INFO:gensim.models.lsimodel:preparing a new chunk of documents\n",
      "INFO:gensim.models.lsimodel:using 100 extra samples and 2 power iterations\n",
      "INFO:gensim.models.lsimodel:1st phase: constructing (73530, 200) action matrix\n",
      "INFO:gensim.models.lsimodel:orthonormalizing (73530, 200) action matrix\n",
      "INFO:gensim.models.lsimodel:2nd phase: running dense svd on (200, 10377) matrix\n",
      "INFO:gensim.models.lsimodel:computing the final decomposition\n",
      "INFO:gensim.models.lsimodel:keeping 100 factors (discarding 14.058% of energy spectrum)\n",
      "INFO:gensim.models.lsimodel:processed documents up to #10377\n",
      "INFO:gensim.models.lsimodel:topic #0(30.732): 0.685*\"vs\" + 0.365*\"cts\" + 0.245*\"shr\" + 0.244*\"mln\" + 0.240*\"loss\" + 0.227*\"net\" + 0.198*\"revs\" + 0.152*\"profit\" + 0.110*\"avg\" + 0.108*\"shrs\"\n",
      "INFO:gensim.models.lsimodel:topic #1(16.452): -0.408*\"qtly\" + -0.396*\"div\" + -0.346*\"record\" + -0.340*\"pay\" + -0.327*\"prior\" + -0.291*\"april\" + -0.280*\"cts\" + 0.225*\"loss\" + -0.150*\"march\" + 0.125*\"profit\"\n",
      "INFO:gensim.models.lsimodel:topic #2(14.137): 0.281*\"the\" + 0.181*\"billion\" + 0.166*\"to\" + 0.164*\"pct\" + 0.157*\"in\" + 0.138*\"s\" + 0.136*\"a\" + 0.124*\"dlrs\" + 0.122*\"of\" + 0.118*\"said\"\n",
      "INFO:gensim.models.lsimodel:topic #3(12.932): 0.795*\"loss\" + 0.402*\"profit\" + -0.238*\"vs\" + -0.183*\"mln\" + -0.115*\"billion\" + 0.094*\"qtly\" + 0.091*\"oper\" + -0.091*\"shr\" + -0.086*\"avg\" + -0.086*\"shrs\"\n",
      "INFO:gensim.models.lsimodel:topic #4(8.752): -0.812*\"oper\" + -0.246*\"excludes\" + -0.224*\"dlrs\" + 0.135*\"profit\" + -0.134*\"gain\" + -0.131*\"note\" + -0.116*\"share\" + -0.114*\"or\" + 0.109*\"vs\" + 0.102*\"loss\"\n",
      "INFO:gensim.similarities.docsim:starting similarity index under /tmp/gensim_index\n",
      "INFO:gensim.similarities.docsim:PROGRESS: fresh_shard size=10000\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done\n"
     ]
    }
   ],
   "source": [
    "from  seldon.text import DocumentSimilarity,DefaultJsonCorpus\n",
    "import logging\n",
    "# Raise the root logger to INFO so gensim and seldon report their progress.\n",
    "logger = logging.getLogger()\n",
    "logger.setLevel(logging.INFO)\n",
    "\n",
    "# Wrap the converted documents in a corpus and fit an LSI similarity model on it.\n",
    "corpus = DefaultJsonCorpus(docs)\n",
    "ds = DocumentSimilarity(model_type=\"gensim_lsi\")\n",
    "ds.fit(corpus)\n",
    "print \"done\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run accuracy tests\n",
    "\n",
    "Run a test over the documents to compute the average Jaccard similarity between each document and its 1-nearest neighbour, using the \"tags\" field of the metadata as the ground truth."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:gensim.similarities.docsim:creating matrix with 10377 documents and 100 features\n",
      "INFO:gensim.similarities.docsim:creating dense shard #0\n",
      "INFO:gensim.similarities.docsim:saving index shard to /tmp/gensim_index.0\n",
      "INFO:gensim.utils:saving MatrixSimilarity object under /tmp/gensim_index.0, separately None\n",
      "INFO:gensim.utils:loading MatrixSimilarity object from /tmp/gensim_index.0\n",
      "INFO:seldon.text.docsim:accuracy: 0.838159 time: 2 secs avg_call_time: 0.000193\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.8381591625443983"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Score with the default (non-approximate) search; the returned value is the accuracy that is also logged.\n",
    "ds.score()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run the test again, but this time use the Annoy approximate nearest neighbour index that has already been built. This should be much faster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:seldon.text.docsim:accuracy: 0.837167 time: 1 secs avg_call_time: 0.000096\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.8371665584656419"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same evaluation, but with approx=True so neighbours come from the approximate index.\n",
    "ds.score(approx=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run single nearest neighbour query\n",
    "Run a nearest neighbour query on a single document and print the title and tag metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query doc:  SHEARSON LEHMAN UPGRADES U.S. OIL STOCKS Tagged: crude\n",
      "[(17254L, 0.5946490168571472), (17289L, 0.5946490168571472), (17359L, 0.5360889732837677), (17419L, 0.5244295299053192), (3430L, 0.5200405716896057)]\n",
      "Doc id 17254 WORLD OIL DEMAND LIKELY TO INCREASE, SUBROTO SAYS Tagged: crude\n",
      "Doc id 17289 WORLD OIL DEMAND LIKELY TO INCREASE, SUBROTO SAYS Tagged: crude\n",
      "Doc id 17359 DECLINE IN U.S. DOLLAR MAY BOOST OPEC OIL PRICE Tagged: crude\n",
      "Doc id 17419 NIPPON MINING PLANS MORE U.S. JOINT VENTURES Tagged: crude\n",
      "Doc id 3430 PICKENS SEES CONTINUED SLUMP IN WORKING RIGS Tagged: crude\n"
     ]
    }
   ],
   "source": [
    "query_doc = 6023\n",
    "\n",
    "# Show the query document's own title and tags first, for comparison.\n",
    "query_meta = ds.get_meta(query_doc)\n",
    "print \"Query doc: \",query_meta['title'],\"Tagged:\",query_meta['tags']\n",
    "\n",
    "# Ask the approximate index for the 5 nearest documents; each result is a (document id, score) tuple.\n",
    "neighbours = ds.nn(query_doc,k=5,translate_id=True,approx=True)\n",
    "print neighbours\n",
    "\n",
    "# Look up and print the title and tags of every neighbour.\n",
    "for doc_id, _ in neighbours:\n",
    "    neighbour_meta = ds.get_meta(doc_id)\n",
    "    print \"Doc id\",doc_id,neighbour_meta['title'],\"Tagged:\",neighbour_meta['tags']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save recommender\n",
    "\n",
    "Save the recommender to the filesystem in the `reuters_recommender` folder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:seldon.util:creating folder /tmp/recommender_tmp646137\n",
      "INFO:gensim.utils:saving Similarity object under /tmp/recommender_tmp646137/gensim_index, separately None\n",
      "INFO:seldon.fileutil:copy /tmp/recommender_tmp646137 to reuters_recommender\n",
      "INFO:seldon.fileutil:copying /tmp/recommender_tmp646137/gensim_index to reuters_recommender/gensim_index\n",
      "INFO:seldon.fileutil:copying /tmp/recommender_tmp646137/annoy_index to reuters_recommender/annoy_index\n",
      "INFO:seldon.fileutil:copying /tmp/recommender_tmp646137/gensim_index.0 to reuters_recommender/gensim_index.0\n",
      "INFO:seldon.fileutil:copying /tmp/recommender_tmp646137/rec to reuters_recommender/rec\n",
      "INFO:seldon.fileutil:copying /tmp/recommender_tmp646137/meta to reuters_recommender/meta\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done\n"
     ]
    }
   ],
   "source": [
    "import seldon\n",
    "# Persist the fitted DocumentSimilarity model into the reuters_recommender folder.\n",
    "rw = seldon.Recommender_wrapper()\n",
    "rw.save_recommender(ds, \"reuters_recommender\")\n",
    "print \"done\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start a microservice to serve the recommender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:seldon.util:creating folder /tmp/recommender_tmp873368\n",
      "INFO:seldon.fileutil:copy reuters_recommender to /tmp/recommender_tmp873368\n",
      "INFO:seldon.fileutil:copying reuters_recommender/gensim_index to /tmp/recommender_tmp873368/gensim_index\n",
      "INFO:seldon.fileutil:copying reuters_recommender/annoy_index to /tmp/recommender_tmp873368/annoy_index\n",
      "INFO:seldon.fileutil:copying reuters_recommender/gensim_index.0 to /tmp/recommender_tmp873368/gensim_index.0\n",
      "INFO:seldon.fileutil:copying reuters_recommender/rec to /tmp/recommender_tmp873368/rec\n",
      "INFO:seldon.fileutil:copying reuters_recommender/meta to /tmp/recommender_tmp873368/meta\n",
      "INFO:gensim.utils:loading Similarity object from /tmp/recommender_tmp873368/gensim_index\n",
      "INFO:werkzeug: * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)\n"
     ]
    }
   ],
   "source": [
    "from seldon.microservice import Microservices\n",
    "# Build a web app around the recommender saved in the reuters_recommender folder.\n",
    "m = Microservices()\n",
    "app = m.create_recommendation_microservice(\"reuters_recommender\")\n",
    "\n",
    "# NOTE: app.run() blocks this cell until the kernel is interrupted, and host 0.0.0.0\n",
    "# listens on every network interface -- use 127.0.0.1 to restrict access to this machine.\n",
    "app.run(host=\"0.0.0.0\", port=5000, debug=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
